################################################################################
##
## Purpose: This script cleans the raw hearings.
##
## Author: James Bisbee (james.h.bisbee@vanderbilt.edu)
##
##  - Inputs:
##    - ./data/raw/hearings/.*.txt: Raw data from https://www.govinfo.gov/app/collection/chrg
##    - ./data/prepped/hearings/converted_docs_new.csv: Saved intermediate file from this script
##    - ./data/prepped/hearings/toCheck_12_25_2021_jbfixed_new.csv: Manually prepared data from an output of this script (./data/prepped/hearings/toCheck_12_25_2021.csv)
##  - Outputs:
##    - ./data/prepped/hearings/toCheck_12_25_2021.csv
##    - ./data/prepped/hearings/converted_docs_new.csv
##    - ./data/prepped/hearings/cleaned_docs.csv
##
##
## See associated log file for compute environment, package versions, 
##  and date of most recent run.
##
################################################################################
# Start from an empty workspace so results do not depend on objects left over
# from an interactive session, then trigger garbage collection.
rm(list = ls())
gc()

# Attach dependencies with library() rather than require(): library() stops
# immediately when a package is missing, whereas require() only warns and
# returns FALSE, letting the script run on to an obscure failure further down.
library(tidyverse)
library(stringdist)

# Fix the RNG seed for reproducibility.
set.seed(123)

# Compute details
# Print a short description of the machine this run executed on, followed by
# the session info (R version and attached packages).
print(paste0('Compute environment from ',Sys.Date(),' run by Bisbee'))
if(Sys.info()['sysname'] == 'Windows') {
  # Query hardware details through wmic; the indexing skips the first line of
  # each command's output (presumably the column header).
  ram_size <- system("wmic MemoryChip get Capacity", intern = TRUE)[-1]
  model_name <- system("wmic cpu get name", intern = TRUE)[2] # nocov
  vendor_id <- system("wmic cpu get manufacturer", intern = TRUE)[2] # nocov
  
  print(list(ram = stringr::str_squish(ram_size)[1],
             vendor_id = stringr::str_squish(vendor_id),
             model_name = stringr::str_squish(model_name),
             no_of_cores = parallel::detectCores()))
} else if(Sys.info()['sysname'] == 'Linux') {
  # Sys.info() reports 'Linux' on Linux systems (the previous literal 'Linuxs'
  # could never match, so this branch was unreachable).
  # Tabulate CPU / memory usage of running rsession processes.
  splitted <- strsplit(system("ps -C rsession -o %cpu,%mem,pid,cmd", intern = TRUE), " ")
  df <- do.call(rbind, lapply(splitted[-1], 
                              function(x) data.frame(
                                cpu = as.numeric(x[2]),
                                mem = as.numeric(x[4]),
                                pid = as.numeric(x[5]),
                                cmd = paste(x[-c(1:5)], collapse = " "))))
  # Print explicitly (as the Windows branch does) so the table also shows up
  # when the script is run through source().
  print(df)
} else {
  cat("If not on Linux or Windows, you'll have to figure out your own solution to seeing the compute environment.")
}

sessionInfo()


# Raw transcripts: every file in ./data/raw/hearings whose name matches '\\.txt'.
fs <- list.files(path = './data/raw/hearings',pattern = '\\.txt')


# Alternation of the state names ("Alabama|Alaska|..."), used for the optional
# " of <State>" suffix on a speaker label.
stateAlternation <- paste(state.name,collapse = '|')

# Regex for a speaker label: honorific, surname (optionally with a second
# space- or hyphen-separated part), optional " of <State>", a period, and then
# a space or dash.
speakerLabel <- paste0('(M(s|r(s)*)\\.|Chair(wo)*man|Senator|Dr\\.) [[:alpha:]]+((\\s|-)[[:alpha:]]+){0,1}( of (',
                       stateAlternation,
                       '))*\\.( |-)')

# split_pattern: where to cut a transcript into utterances -- four whitespace
#   characters followed by a speaker label, the position right before
#   "The Chairman." / "The Clerk.", or a "[Whereupon" / "[Prepare" bracket.
# extract_pattern: pulls the speaker label off the front of an utterance.
split_pattern <- paste0('\\s{4}(?=',speakerLabel,
                        ')|(?=The Chairman\\.( |-))|(?=The Clerk\\.( |-))|\\[(Whereupon|Prepare)')
extract_pattern <- paste0('^',speakerLabel,
                          '|The Chairman\\.( |-)|The Clerk\\.( |-)')

# Accumulator for the parsed utterances of all documents.
hearings <- NULL
# Main parsing loop. For every transcript file:
#   1. read the file and apply hand-written corrections to the raw text,
#   2. split the text into utterances and pull the speaker label off each one,
#   3. build a per-document `lookup` from speaker label to a full name and a
#      state, both searched for in splits[1] (the text before the first
#      utterance),
#   4. join the lookup onto the utterances and append them to `hearings`.
for(doc in fs) {
  dat <- readLines(paste0('./data/raw/hearings/',doc))
  
  
  # Collapse the lines into one string and patch specific passages before
  # splitting: re-insert or correct speaker labels for individual utterances
  # (Bennett, Yellen, Brown, Sherman), drop "[presiding]" / "[continuing]"
  # tags, and rewrite a "submitted by ... Senator Paul S. Sarbanes" phrase so
  # that it no longer contains "Senator Paul S. " in label form.
  prepDat <- gsub('submitted by \\n  Senator Paul S\\. Sarbanes','submitted by Senator Paul S Sarbanes',
                  gsub('\\\n    He starts out,','\n    Senator Bennett. He starts out,',
                       gsub(' \\[(presiding|continuing)\\]','',
                            gsub('\\\n    Me. Yellen. ','\n    Ms. Yellen. ',
                                 gsub('\\\n    It is since we set the','\n    Ms. Yellen. It is since we set the',
                                      gsub('\\\n    Well, one flaw is that the clearinghouse estimates','\n    Ms. Yellen. Well, one flaw is that the clearinghouse estimates',
                                           gsub("Senator Shelby. That is prospective, isn't it","Senator Brown. That is prospective, isn't it",
                                                gsub('Mr\\. Greenspan\\. The lead story today is \\$450 billion','Mr. Sherman. The lead story today is $450 billion',
                                                     paste(dat,collapse = '\n')))))))))
  
  # Cut into utterances; the speaker label is whatever extract_pattern matches
  # (NA when nothing matches), trimmed and with a trailing "-" removed.
  splits <- unlist(str_split(prepDat,split_pattern))
  speakers <- gsub('-$','',trimws(str_extract(splits,pattern = extract_pattern)))
  
  text <- as_tibble(data.frame(docID = doc,
                               speaker = speakers,
                               text = splits,stringsAsFactors = F))
  
  # Document-specific spelling fix for one speaker label.
  if(doc == 'fed2004-07-21.txt') {
    text <- text %>%
      mutate(speaker = ifelse(speaker == 'Ms. Velasquez.','Ms. Velazquez.',speaker))
  }
  # Treat "Mr. Chairman." as the same label as "The Chairman.".
  text <- text %>%
    mutate(speaker = ifelse(speaker == 'Mr. Chairman.','The Chairman.',speaker))
  # Distinct speaker labels in sorted order (may include NA).
  speakerNames <- unique(text$speaker)[order(unique(text$speaker))]
  
  # One row per speaker label: speaker, name, state.
  lookup <- NULL
  for(spkr in speakerNames) {
    # Utterances without a recognised label get no lookup row.
    if(is.na(spkr)) { next}
    
    if(spkr == 'The Chairman.') {
      # Find, in splits[1], a span that starts after a blank line with 5+
      # whitespace characters and a capital letter and ends in "Chair(wo)man"
      # (followed by a blank line, or a single newline for fed2002-07-17.txt);
      # strip the padding and upper-case it.
      if(doc == 'fed2002-07-17.txt') {
        extract <- toupper(gsub('\\n\\n|\\s{5,}','',str_extract(splits[1],'\\n\\n\\s{5,}[A-Z].*?Chair(wo)*man\\n')))
      } else {
        extract <- toupper(gsub('\\n\\n|\\s{5,}','',str_extract(splits[1],'\\n\\n\\s{5,}[A-Z].*?Chair(wo)*man\\n\\n')))
      }
      # Which state names (plus Guam) occur in that span, each preceded by a space?
      tmp <- unlist(sapply(toupper(c(state.name,'Guam')),function(x) grepl(paste0(' ',x),extract)))
      state <- names(tmp)[which(tmp)]
      # Several hits: keep the longest name (e.g. ' VIRGINIA' also matches
      # inside 'WEST VIRGINIA').
      if(length(state) > 1) {
        state <- state[which(nchar(state) == max(nchar(state)))]
      }
      # The name is everything before the first comma.
      # NOTE(review): unlike the branch below, nothing here handles a
      #   zero-length `state`, which would make the data.frame() call at the
      #   end of the loop fail -- confirm this cannot occur.
      name <- gsub(',.*','',extract)
    } else {
      # nm: upper-cased label with the honorific, any " of ..." suffix,
      #   periods and a trailing single-character token removed.
      # pat: text ending in that surname (optionally followed by JR./I.../ESQ.),
      #   then ", " and whatever follows up to a run of 2+ whitespace
      #   characters or a newline.
      nm <- gsub(' \\w$','',toupper(trimws(gsub('(M(s|r(s)*)\\.|Chair(wo)*man|Senator)|Dr\\.|\\.','',gsub(' of .*','',spkr)))))
      pat <- paste0('(\\n|\\s+).*? ',nm,'( JR\\.| I+| ESQ\\.)*, .*?(\\s{2,}|\\n)')
      
      # Abort when splits[1] has no "C O N T E N T S" heading.
      # NOTE(review): stop() is called without a message, so a failure here
      #   does not say which document triggered it.
      if(!grepl('C O N T E N T S.*',toupper(splits[1]))) { stop() }
      # Search the upper-cased splits[1] with the 'C O N T E N T S.*' match removed.
      extract <- trimws(str_extract_all(gsub('C O N T E N T S.*','',toupper(splits[1])),pattern = pat)[[1]])
      # One-off: for this speaker/document use a pattern without the trailing
      # whitespace requirement and take the second hit.
      if(spkr == 'Ms. Garcia of Texas.' & doc == 'fed2019-07-10.txt') {
        extract <- trimws(str_extract_all(gsub('C O N T E N T S.*','',toupper(splits[1])),pattern = paste0('(\\n|\\s+).*? ',nm,'( JR\\.| I+| ESQ\\.)*, .*'))[[1]])[2]
      }
      
      # Nothing found: retry against all of splits[1].
      if(length(extract) == 0) {
        extract <- trimws(str_extract_all(toupper(splits[1]),pattern = pat)[[1]])
      }
      
      # Hits containing a run of 2+ whitespace characters: remove the text up
      # to and including such runs, keeping what follows.
      if(any(grepl('\\s{2,}',extract))) {
        extract[which(grepl('\\s{2,}',extract))] <- trimws(gsub('.*?\\s{2,}','',extract[which(grepl('\\s{2,}',extract))]))
      }
      # Several hits: prefer the ones that mention a state name.
      if(length(extract) > 1 & any(grepl(paste(toupper(state.name),collapse = '|'),extract))) {
        extract <- extract[which(grepl(paste(toupper(state.name),collapse = '|'),extract))]
      }
      if(length(extract) > 1 & grepl(' of ',spkr)) {
        # Still several hits and the label carries " of <State>": take the
        # state from the label and keep only the hits that mention it.
        state <- gsub(' OF |\\.','',str_extract(toupper(spkr),' OF .*'))
        extract <- extract[which(grepl(toupper(gsub('.* of |\\.','',spkr)),extract))]
        # NOTE(review): the doubled assignment is redundant but harmless.
        name <- name <- gsub(',.*','',extract)
      } else if(length(extract) > 1) {
        # Still several hits and no state in the label: look for state names
        # in the utterance right before this speaker's first utterance and
        # keep the hits that mention one.
        # NOTE(review): grepl() uses only the first element when `states` has
        #   more than one entry.
        intro <- text$text[min(which(text$speaker == spkr))-1]
        states <- str_extract(intro,state.name)
        states <- states[which(!is.na(states))]
        extract <- extract[which(grepl(paste0(' ',toupper(states)),extract))]
        
        tmp <- sapply(toupper(c(state.name,'Guam')),function(x) grepl(paste0(' ',x),extract))
        state <- names(tmp)[which(tmp)]
        if(length(state) > 1) {
          state <- state[which(nchar(state) == max(nchar(state)))]
        }
        name <- gsub(',.*','',extract)
      } else {
        # Zero or one hit. A single hit ending in NORTH / WEST is completed to
        # NORTH CAROLINA / WEST VIRGINIA before the state search.
        if(length(extract) == 1) {
          if(grepl('(NORTH)$',extract)) {
            extract <- paste0(extract,' CAROLINA')
          }
          if(grepl('(WEST)$',extract)) {
            extract <- paste0(extract,' VIRGINIA')
          }
        }
        tmp <- unlist(sapply(toupper(c(state.name,'Guam')),function(x) grepl(paste0(' ',x),extract)))
        state <- names(tmp)[which(tmp)]
        if(length(state) > 1) {
          state <- state[which(nchar(state) == max(nchar(state)))]
        }
        name <- gsub(',.*','',extract)
      }
      
      # Still more than one candidate state: look for "<SURNAME> OF <STATE>"
      # in splits[1] (text up to "PRESENT:" and newlines removed) and keep the
      # one whose state is named in the preceding utterance.
      if(length(state) > 1) {
        extract <- trimws(str_extract_all(gsub('.*PRESENT:|\\\n','',toupper(splits[1])),pattern = paste0(nm,' OF \\w+(\\s\\w+){0,1}'))[[1]])
        intro <- text$text[min(which(text$speaker == spkr))-1]
        states <- str_extract(intro,state.name)
        states <- states[which(!is.na(states))]
        state <- trimws(gsub(paste0(nm,'|OF '),'',extract[which(grepl(toupper(states),extract))]))
      }

      # Hard-coded resolution for labels that remain ambiguous.
      if(length(state) > 1) {
        if(spkr == 'Mrs. Maloney.') {
          state <- 'NEW YORK'
        } else if(spkr == 'Mr. Maloney.') {
          state <- 'CONNECTICUT'
        }
        
        if(spkr == 'Mr. Jones.') {
          state <- 'NORTH CAROLINA'
        } else if(spkr == 'Mrs. Jones.') {
          state <- 'OHIO'
        }
        
      }
      
      # Anything still ambiguous is a hard error (again without a message).
      if(length(state) > 1) {
        stop()
      }
      # No state found (or NA): record an empty string.
      if(length(state) == 0) {
        state <- ''
      }
      if(is.na(state)) {
        state <- ''
      }
      # Clean-ups for specific malformed values: spans that ran into the next
      # entry, and truncated NORTH / WEST.
      if(state == 'WEST VIRGINIA  MICHAEL E. CAPUANO, MASSACHUSETTS') {
        state <- 'WEST VIRGINIA'
      }
      if(state == 'WEST VIRGINIA  RUBEN HINOJOSA, TEXAS') {
        state <- 'WEST VIRGINIA'
      }
      if(state == 'NORTH CAROLINA,  MAXINE WATERS, CALIFORNIA, RANKING') {
        state <- 'NORTH CAROLINA'
      }
      if(state %in% c('','FITZPATRICK,') & spkr == 'Mr. Fitzpatrick.') {
        state <- 'PENNSYLVANIA'
      }
      if(state == 'NORTH') {
        state <- 'NORTH CAROLINA'
      }
      if(state == 'WEST') {
        state <- 'WEST VIRGINIA'
      }
      # Manual override for one speaker in one document.
      if(spkr == 'Mr. Ross.' & doc == 'fed2002-07-17.txt') {
        state <- 'ARKANSAS'
      }
    }
    # Append this speaker; when no name was found fall back to the label itself.
    # (ifelse() with a length-1 test keeps only the first element of `name`.)
    lookup <- bind_rows(lookup,data.frame(speaker = spkr,
                                          name = ifelse(length(name) == 0,spkr,name),
                                          state = state,stringsAsFactors = F))
  }
  
  # Attach name/state to every utterance and add the document to the corpus.
  text <- text %>% left_join(lookup,by = 'speaker')
  hearings <- bind_rows(hearings,text)
}


# Running index over every utterance in the corpus.
hearings <- mutate(hearings,ind = row_number())

# Only people without "states" are the Fed chairs and a number of experts
hearings %>%
  filter(state == '') %>%
  filter(!grepl('Greenspan|Mattingly|Bernanke|Yellen',speaker)) %>%
  select(speaker,state,docID) %>%
  distinct() %>%
  data.frame() %>%
  head()



# Check to see if anyone is "responding" to themselves: 70 instances in total
hearings %>%
  mutate(ind = row_number()) %>%
  mutate(respondingTo = lag(speaker)) %>%
  filter(speaker == respondingTo)

# Manually inspect each and determine whether there is a mistake in the processing code above,
#   a typo in the original text, or just the simple case that utterances can be separated by 
#   things other than others' utterances (laughter, audio issues, recess, charts / presentations, etc.)
# Each list element is the pair (previous row index, own row index) for one self-response.
selfResponseInds <- hearings %>%
  mutate(respondingTo = lag(speaker)) %>%
  filter(speaker == respondingTo) %>%
  pull(ind) %>%
  lapply(function(rowInd) c(rowInd-1,rowInd))

# 1043:1045 are borked! What is happening here?
#   NF thinks (and JB agrees) that the middle utterance should be attributed to Sherman, not Greenspan. Fixing above. (NB: everything that is "fixed above" will not correspond to these indices after they are fixed)
# I think 4162:4164 should have Senator Bennett start talking again after Chairman Dodd says "without objection". If so, I'm going to modify the script above to insert the correct name at the start
#   of that line and allow the processing to run. 
# 5775:5776 seems to be an error due to a bathroom break and the additional indicator that Senator Carper is "presiding". I'm going to fix that above.
# 15133:15134 looks like the transcription just FORGOT YELLEN'S NAME! That's alarming. Fixing above. 
# 17049:17050 also looks like the transcription just forgot Yellen! Fixing above again.
# 18640:18641 seems to have mistakenly indicated that Shelby was speaking when it was in fact Brown. Fixing above. 
# Print one pair of indices and the text of the two utterances for inspection.
inds <- selfResponseInds[[44]]
inds
cat(hearings$text[seq_len(nrow(hearings)) %in% inds])

# The below are all unnecessary line breaks due to vote tallying / notifications of where to find prepared statements / laughter / quotes / charts / beginnings of statements. Combine
# Each entry lists the positions in selfResponseInds that are merged into one utterance:
#   - 1:2 and 3:4 are merged pairwise
#   - 5 through 60 are each merged on their own
#     (I think 8, i.e. 1058:1059, is some weird division where Mr. Sherman asks an outlandish question,
#      and then the transcript puts in a pause, after which he speaks again to thank the chair for his time.)
#   - 61:63 is a sequence of Waters calling on people who don't reply
#   - 64 through 67 are each merged on their own
#   - 68:69 is another sequence of Waters calling on absent people
#   - 70 is merged on its own
combineGroups <- c(list(1:2,3:4),
                   as.list(5:60),
                   list(61:63),
                   as.list(64:67),
                   list(68:69,70))

# Row indices for every group; `[[` keeps the out-of-bounds error if there are fewer than 70 entries.
toComb <- lapply(combineGroups,
                 function(grp) unlist(lapply(grp,function(k) selfResponseInds[[k]])))



# Give every row of a group the index of the group's first row.
for(grp in toComb) {
  hearings$ind[which(hearings$ind %in% grp)] <- grp[1]
}

# So collapse these 68 offenders who are inaccurately separated, and remove the
#   name of the speaker from the text itself.
#
# The label to strip is described by the same regex the script uses to extract
#   speakers, *including* the optional " of <State>" suffix. Without that suffix
#   the label of speakers such as "Mr. Green of Texas." was left at the start of
#   their text, while every other speaker's label was removed.
speakerPrefix <- paste0('^(M(s|r(s)*)\\.|Chair(wo)*man|Senator|Dr\\.) [[:alpha:]]+((\\s|-)[[:alpha:]]+){0,1}( of (',
                        paste(state.name,collapse = '|'),
                        '))*\\.( |-)|The Chairman\\.( |-)|The Clerk\\.( |-)')

# Rows that share an `ind` (set when merging self-responses) are pasted
#   together with a newline; nTest records how many rows went into each.
hearings <- hearings %>%
  group_by(docID,speaker,state,ind,name) %>%
  summarise(text = paste(gsub(speakerPrefix,'',text),collapse = '\n'),
            nTest = n(),.groups = 'drop') %>%
  arrange(ind)

# Re-run the self-response check on the collapsed data.
hearings %>%
  mutate(respondingTo = lag(speaker)) %>%
  filter(speaker == respondingTo)

# Yay! No one is responding to themselves anymore! What a huge pain!

table(hearings$state)

# Two "state" values hold a whole name/state/role string: relabel the speaker
# and reduce the value to "<STATE>, Chairman".
hearings <- hearings %>%
  mutate(isFrankEntry = state == 'BARNEY FRANK, Massachusetts, Chairman',
         isOxleyEntry = state == 'MICHAEL G. OXLEY, Ohio, Chairman',
         speaker = ifelse(isFrankEntry,'Chairman Frank.',
                          ifelse(isOxleyEntry,'Chairman Oxley.',speaker)),
         state = ifelse(isFrankEntry,'MASSACHUSETTS, Chairman',
                        ifelse(isOxleyEntry,'OHIO, Chairman',state))) %>%
  select(-isFrankEntry,-isOxleyEntry)

# state2 = the part of `state` before the first comma;
# position = the second ', '-separated piece of `state` (NA when there is none).
hearings <- hearings %>%
  mutate(state2 = gsub(',.*','',state),
         position = vapply(str_split(state,', '),function(parts) parts[2],character(1)))


# Remove all non-verbal textual information (laughter, recess, etc.), i.e.
# anything in square brackets, and record the length of the original text.
hearings <- hearings %>%
  mutate(nchars = nchar(text),
         textclean = gsub('\\[.*?\\]','',text))

# Restart the utterance index within every document.
hearings <- hearings %>%
  group_by(docID) %>%
  mutate(ind = row_number()) %>%
  ungroup()

# Checking to see if we're missing speakers who should in fact be in there:
# documents whose number of speaker-less rows is not exactly two.
inspects <- hearings %>%
  filter(is.na(speaker)) %>%
  count(docID) %>%
  filter(n != 2) %>%
  pull(docID)

# Looks good! Of those hearings where we do not have exactly two NA speakers (meaning the preamble and the postscript), 
#   all of these appear to be additional technical details
hearings %>%
  filter(docID %in% inspects,
         is.na(speaker),
         !grepl('^\\[|^,|^d statements,',text))

# Looking for errors in two people with the same name
test <- apply(table(hearings$speaker,hearings$state2) > 0,1,sum)
test[test > 1] # 6 speakers are associated with more than one state

# All of them are correct though! (based on manual inspection by JB). 
distinct(select(filter(hearings,speaker == 'Mr. Ross.'),
                speaker,state2,docID))


# What about with the full names?
test <- apply(table(hearings$name,hearings$state2) > 0,1,sum)
test[test > 1] # No name is associated with more than one state


# What about people who aren't associated with states? They better be either Fed chairs or experts
# Confirmed!
distinct(select(filter(hearings,state2 == ''),speaker))

# Per document: number of utterances whose speaker label contains "Senat".
chamberID <- hearings %>%
  group_by(docID) %>%
  summarise(senate = sum(grepl('Senat',speaker)))

# Interesting...it seems the senate goes first/last on odd years and last/first on even years
chamberID %>% data.frame()

# A document with at least one such label is a Senate hearing, otherwise House.
# The join key is spelled out so the merge cannot silently pick up additional
# shared columns (and no longer prints a "Joining with" message).
hearings <- hearings %>%
  left_join(chamberID %>%
              mutate(chamber = ifelse(senate > 0,'Senate','House')) %>%
              select(docID,chamber),
            by = 'docID')

# Check-point since this is going to require a human review
write.csv(hearings,file = './data/prepped/hearings/converted_docs_new.csv')


# Need to link everyone with their opensecretsID (god help us)

# State name (upper case) -> postal abbreviation, with Guam added.
stateLookup <- data.frame(state = c(toupper(state.name),'GUAM'),
                          stab = c(state.abb,'GU'))

# One row per (state, hearing year, name, chamber) for every labelled speaker
# with a state; the year is parsed from the date embedded in the file name.
toFill <- hearings %>%
  filter(!is.na(speaker)) %>%
  filter(state2 != '') %>%
  transmute(state2,
            year = lubridate::year(as.Date(gsub('fed|\\.txt','',docID))),
            name,
            chamber) %>%
  distinct() %>%
  left_join(stateLookup,by = c('state2' = 'state'))



# So the trick here will be a multi-stage merge, wherein we first just match based on full name, and then go through those we missed.
# For every hearing year:
#   Stage 1 joins speakers to candidates on cleaned full name + state abbreviation;
#   Stage 2 assigns each unmatched speaker the closest candidate name (string
#           distance) among candidates of the same state and chamber.
# The result (toCheck) is written out for manual review.
toCheck <- NULL
for(yr in unique(toFill$year)) {
  # `yr` rounded down to an even year
  cycle <- floor(yr / 2)*2
  # Stage 1: Names only
  # Candidate files for that even year and the one two years later (2022 is skipped).
  # NOTE(review): read.delim() defaults to header = TRUE, so the first line of
  #   each file is consumed as a header row before col.names is applied --
  #   confirm the raw files really start with a header line.
  candsFull <- NULL
  for(cyc in c(cycle,(cycle+2))) {
    if(cyc == 2022) { next }
    candsFull <- bind_rows(candsFull,as_tibble(read.delim(paste0('./data/raw/donations/cands',substr(as.character(cyc),3,4),'.txt'),
                                  col.names = c('cycle','fecID','opensecretsID','FirstLastP','party','distIDrunfor','distIDcurr',
                                                'currCand','CycleCand','CRPICO','RecipCode','NoPacs'),sep = ',',quote = '|')))
    
  }
  
  # Candidate reference table: state abbreviation = first two characters of
  # distIDcurr; name = upper-cased FirstLastP with any " (..." suffix removed.
  cands <- candsFull %>%
    mutate(stab = substr(distIDcurr,1,2),
           name = toupper(gsub(' \\(.*','',FirstLastP))) %>%
    select(opensecretsID,FirstLastP,party,distIDcurr,stab,name) %>%
    distinct()
  
  # This year's speakers with a cleaned name (non-alphanumerics removed, the
  # hyphen of OCASIO-CORTEZ restored). Computed once and reused below.
  toFillYr <- toFill %>%
    mutate(nameCln = gsub('OCASIOCORTEZ','OCASIO-CORTEZ',trimws(gsub("[^[:alnum:] ]", "",name)))) %>%
    filter(year == yr)
  
  # Exact join on cleaned name + state; rows with a candidate are "merged".
  joined <- toFillYr %>%
    left_join(cands,by = c('nameCln' = 'name','stab'))
  
  merged <- joined %>%
    filter(!is.na(FirstLastP))
  
  # Candidate IDs that matched more than one row: redo the join for them using
  # only candidate records whose `cycle` column (not the loop variable) is <= yr.
  test <- merged %>%
    group_by(opensecretsID) %>%
    mutate(n=n()) %>% filter(n > 1)
  if(nrow(test) > 0) {
    merged <- merged %>%
      filter(!opensecretsID %in% test$opensecretsID) %>%
      bind_rows(toFillYr %>%
                  left_join(candsFull %>%
                              filter(opensecretsID %in% test$opensecretsID) %>%
                              filter(cycle <= yr) %>%
                              mutate(stab = substr(distIDcurr,1,2),
                                     name = toupper(gsub(' \\(.*','',FirstLastP))) %>%
                              select(opensecretsID,FirstLastP,party,distIDcurr,stab,name) %>%
                              distinct(),by = c('nameCln' = 'name','stab')) %>%
                  filter(!is.na(FirstLastP)))
  }
  
  # So in the 2000 cycle data, we matched 32 out of 42 people on name only
  unmatched <- joined %>%
    filter(is.na(FirstLastP)) %>%
    select(state2,name,year,stab,chamber)
  
  # Stage 2: fuzzy match within the same state and chamber (third character of
  # distIDcurr is 'S' for Senate rows, a digit for House rows).
  fixed <- NULL
  for(i in seq_len(nrow(unmatched))) {
    if(unmatched$chamber[i] == 'Senate') {
      ref <- cands %>%
        filter(stab == unmatched$stab[i],
               substr(distIDcurr,3,3) == 'S')
    } else {
      ref <- cands %>%
        filter(stab == unmatched$stab[i],
               grepl('\\d',substr(distIDcurr,3,3)))
    }
    
    # Keep the candidate with the smallest string distance and record it in
    # `dist` so the reviewer can judge the match.
    dists <- stringdist(ref$name,unmatched$name[i])
    fixed <- bind_rows(fixed,
                       bind_cols(unmatched %>%
                                   slice(i),
                                 ref %>%
                                   slice(which.min(dists)) %>%
                                   select(-name,-stab) %>%
                                   mutate(dist = dists[which.min(dists)])))
    
  }
  
  toCheck <- bind_rows(toCheck,bind_rows(fixed,merged)) %>%
    distinct()

  # Sanity check. The previous guard tested `toCheck$speaker`, a column that
  # toCheck never has, so it could not fire. Test the columns that identify a
  # row instead: the speaker's name and the matched candidate ID.
  if(any(is.na(toCheck$name)) || any(is.na(toCheck$opensecretsID))) {
    stop('toCheck contains rows without a name or an opensecretsID (hearing year ',yr,')')
  }
}


# This is going to require a human review...happy holidays to me
write.csv(toCheck,file = './data/prepped/hearings/toCheck_12_25_2021.csv')




# WELL THAT WAS HELL!
# Reload the check-pointed hearings and the manually corrected lookup
# (the first column of each file is dropped on read).
hearings <- read_csv('./data/prepped/hearings/converted_docs_new.csv',col_select = -1)
lookup <- read_csv('./data/prepped/hearings/toCheck_12_25_2021_jbfixed_new.csv',col_select = -1)

# Now need to make sure we don't have duplicates for a given year:
#   Fixed! (But it was hell)
lookup %>%
  group_by(opensecretsID,year) %>%
  mutate(n=n()) %>%
  filter(n > 1)


# But it seems that the merge works fine...the only people we don't have IDs for are
#   the Fed chairs and experts, neither of which we expect to have ids for anyway.
# "No state" is tested as NA *or* empty string: read_csv() reads blank fields
#   back as NA by default, so a bare `state2 == ''` comparison yields NA here and
#   would turn the affected names into NA instead of replacing them.
hearings %>%
  mutate(name = ifelse(name == 'BERNANKE' & (is.na(state2) | state2 == ''),'BEN S. BERNANKE',
                       ifelse(name == 'GREENSPAN' & (is.na(state2) | state2 == ''),'ALAN GREENSPAN',
                              ifelse(name == 'YELLEN' & (is.na(state2) | state2 == ''),'JANET L. YELLEN',
                                     ifelse(name == 'POWELL' & (is.na(state2) | state2 == ''),'JEROME H. POWELL',name))))) %>%
  left_join(lookup %>%
              select(-dist,-nameCln,-matches('jb'))) %>%
  filter(is.na(opensecretsID)) %>%
  group_by(name) %>%
  summarise(n=n())

# Actual merge: give the Fed chairs their full names (based on the speaker
#   label), add the hearing year parsed from the file name, and join the lookup
#   on all shared columns.
merged <- hearings %>%
  mutate(name = ifelse(grepl('Bernanke',speaker),'BEN S. BERNANKE',
                       ifelse(grepl('Greenspan',speaker),'ALAN GREENSPAN',
                              ifelse(grepl('Yellen',speaker),'JANET L. YELLEN',
                                     ifelse(grepl('Powell',speaker),'JEROME H. POWELL',name)))),
         year = lubridate::year(as.Date(gsub('fed|\\.txt','',docID)))) %>%
  left_join(lookup %>%
              select(-dist,-nameCln,-matches('jb')))


# Dry run of the position / speaker-label clean-up: print the rows that end
# up without a speaker label.
merged %>%
  # Role of each speaker, decided from the label as it stands before relabelling
  mutate(position = ifelse(
    grepl('Greenspan|Powell|Yellen|Bernanke',speaker) & is.na(state2),
    'Fed Chair',
    ifelse(
      grepl('Calabria|Kohn|McCloskey|Koo|Mattingly|Meltzer|Mishel|Taylor',speaker) & is.na(state2),
      'Expert',
      ifelse(name == 'The Clerk.',
             'Admin Support',
             ifelse(grepl('Chair',speaker),'Committee Chair','Legislator'))))) %>%
  # Harmonise the labels one substitution at a time: first drop any
  # " of <State>" suffix, then apply the individual relabellings
  mutate(speaker = gsub(' of \\w+(\\s\\w+)*\\.','.',speaker),
         speaker = gsub('M(r)*s. Yellen','Chairwoman Yellen',speaker),
         speaker = gsub('Senator Paul S.','Senator Sarbanes.',speaker),
         speaker = gsub('Dr. Paul','Mr. Paul',speaker),
         speaker = gsub('Mr. Greenspan','Chairman Greenspan',speaker),
         speaker = gsub('Ms. Biggert','Mrs. Biggert',speaker),
         speaker = gsub('Mr. Bernanke','Chairman Bernanke',speaker),
         speaker = gsub('Ms. Bachmann','Mrs. Bachmann',speaker)) %>%
  mutate(speaker = ifelse(!is.na(name) & name == 'SUE W. KELLY','Mrs. Kelly.',speaker)) %>%
  filter(is.na(speaker))

# Final clean-up of the merged data:
#   1. position: role of the speaker, decided from the label before relabelling
#   2. speaker:  harmonised labels (state suffix dropped, individual relabellings)
#   3. opensecretsID: synthetic IDs for speakers without one -- 'FED' / 'EXPERT'
#      followed by the name with its leading part removed by the regex below,
#      and 'ADMIN' for administrative support
merged <- merged %>%
  mutate(position = ifelse(
    grepl('Greenspan|Powell|Yellen|Bernanke',speaker) & is.na(state2),
    'Fed Chair',
    ifelse(
      grepl('Calabria|Kohn|McCloskey|Koo|Mattingly|Meltzer|Mishel|Taylor',speaker) & is.na(state2),
      'Expert',
      ifelse(name == 'The Clerk.',
             'Admin Support',
             ifelse(grepl('Chair',speaker),'Committee Chair','Legislator'))))) %>%
  mutate(speaker = gsub(' of \\w+(\\s\\w+)*\\.','.',speaker),
         speaker = gsub('M(r)*s. Yellen','Chairwoman Yellen',speaker),
         speaker = gsub('Senator Paul S.','Senator Sarbanes.',speaker),
         speaker = gsub('Dr. Paul','Mr. Paul',speaker),
         speaker = gsub('Mr. Greenspan','Chairman Greenspan',speaker),
         speaker = gsub('Ms. Biggert','Mrs. Biggert',speaker),
         speaker = gsub('Mr. Bernanke','Chairman Bernanke',speaker),
         speaker = gsub('Ms. Bachmann','Mrs. Bachmann',speaker)) %>%
  mutate(speaker = ifelse(!is.na(name) & name == 'SUE W. KELLY','Mrs. Kelly.',speaker)) %>%
  mutate(opensecretsID = ifelse(
    is.na(opensecretsID) & position == 'Fed Chair',
    paste0('FED',gsub('^.*? (\\w{2,})','\\1',name)),
    ifelse(
      is.na(opensecretsID) & position == 'Expert',
      paste0('EXPERT',gsub('^.*? (\\w{2,})','\\1',toupper(gsub('\\.','',name)))),
      ifelse(position == 'Admin Support','ADMIN',opensecretsID))))
  

# Rows that still have no speaker label
filter(merged,is.na(speaker))

# Everyone with an opensecretsID that shows up multiple times is either an 
#   expert, a Fed chair, or a legislator who switches chambers, parties, or hearing Chair.
#   There are also a few people whose recorded names were written slightly differently across hearings.
# dupes: distinct speaker/name/role combinations whose opensecretsID occurs
#   more than once, stored ordered by name and printed ordered by ID.
dupes <- merged %>%
  select(speaker,name,position,chamber,party,stab,opensecretsID) %>%
  distinct() %>%
  arrange(name) %>%
  add_count(opensecretsID) %>%
  filter(n > 1) %>%
  data.frame()
arrange(dupes,opensecretsID)

# So I think this is looking pretty good now. Saving to be linked with
# - children
# - bills
# - demographics
# - IG ratings
write.csv(merged,file = './data/prepped/hearings/cleaned_docs.csv')

# EOF